library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.0 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.0
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the ]8;;http://conflicted.r-lib.org/conflicted package]8;; to force all conflicts to become errors
library(plotly)
##
## Attaching package: 'plotly'
##
## The following object is masked from 'package:ggplot2':
##
## last_plot
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following object is masked from 'package:graphics':
##
## layout
library(viridisLite)
ggplot(data=iris,aes(x=Sepal.Length, y=Sepal.Width))+geom_point(aes(color=Species, shape=Species))+labs(title="Iris Sepal Length vs Wide", x="Sepal Length", y="Sepal Width", color="Plant Species", shape="Plant Species")
ggplot(data=iris,aes(x=Sepal.Length, y=Sepal.Width))+geom_point(aes(color=Species, shape=Species))+labs(title="Iris Sepal Length vs Wide", x="Sepal Length", y="Sepal Width", color="Plant Species", shape="Plant Species") +theme_classic()
ggplot(data=iris,aes(x=Sepal.Length, y=Sepal.Width))+geom_point(aes(color="red", shape=Species))+labs(title="Iris Sepal Length vs Wide", x="Sepal Length", y="Sepal Width", color="Plant Species", shape="Plant Species")
ggplot(data=iris,aes(x=Sepal.Length, y=Sepal.Width))+geom_point(aes(color="species", shape=Species))+scale_color_manual(values=c("blue", "purple", "red"))+labs(title="Iris Sepal Length vs Wide", x="Sepal Length", y="Sepal Width", color="Plant Species", shape="Plant Species")
ggplot(data=iris,aes(x=Sepal.Length, y=Sepal.Width))+geom_point(aes(color="species", shape=Species))+scale_color_brewer(palette="dark2")+labs(title="Iris Sepal Length vs Wide", x="Sepal Length", y="Sepal Width", color="Plant Species", shape="Plant Species")
## Warning: Unknown palette: "dark2"
ggplot(data=iris,aes(x=Sepal.Length, y=Sepal.Width))+geom_point(aes(color="species", shape=Species))+scale_color_viridis_d()+labs(title="Iris Sepal Length vs Wide", x="Sepal Length", y="Sepal Width", color="Plant Species", shape="Plant Species")
pdf("~/Bio 676/Lab9/images/iris_example_plot1.pdf", width=6, height=3)
ggplot(data=iris, aes(x=Sepal.Length, y=Sepal.Width, color=Species))+geom_point()+labs(title="Iris Sepal Length vs Wide", x="Sepal Length", y="Sepal Width")
dev.off()
## png
## 2
ppi <- 300
png("images/iris_example_plot2.png", width=6*ppi, height=4*ppi, res=ppi)
ggplot(data=iris, aes(x=Sepal.Length, y=Sepal.Width, color=Species))+geom_point()
dev.off()
## png
## 2
Iris example plot
ggplotly(ggplot(data=iris, aes(x=Sepal.Length, y=Sepal.Width, color=Species))+geom_point())
p <- ggplot(data=iris, aes(x=Sepal.Length, y=Sepal.Width, color=Species))+geom_point()
ggplotly(p)
#Format data set
NEON_MAGs <- read_csv("~/Bio 676/Lab9/data/GOLD_Study_ID_Gs0161344_NEON.csv") %>%
# remove columns that are not needed for data analysis
select(-c(`GOLD Study ID`, `Bin Methods`, `Created By`, `Date Added`)) %>%
# create a new column with the Assembly Type
mutate("Assembly Type" = case_when(`Genome Name` == "NEON combined assembly" ~ `Genome Name`,
TRUE ~ "Individual")) %>%
mutate_at("Assembly Type", str_replace, "NEON combined assembly", "Combined") %>%
separate(`GTDB-Tk Taxonomy Lineage`, c("Domain", "Phylum", "Class", "Order", "Family", "Genus"), "; ", remove = FALSE) %>%
# Get rid of the the common string "Soil microbial communities from "
mutate_at("Genome Name", str_replace, "Terrestrial soil microbial communities from ", "") %>%
# Use the first `-` to split the column in two
separate(`Genome Name`, c("Site","Sample Name"), " - ") %>%
# Get rid of the the common string "S-comp-1"
mutate_at("Sample Name", str_replace, "-comp-1", "") %>%
# separate the Sample Name into Site ID and plot info
separate(`Sample Name`, c("Site ID","subplot.layer.date"), "_", remove = FALSE,) %>%
# separate the plot info into 3 columns
separate(`subplot.layer.date`, c("Subplot", "Layer", "Date"), "-")
## Rows: 1754 Columns: 19
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (8): Bin ID, Genome Name, Bin Quality, Bin Lineage, GTDB-Tk Taxonomy L...
## dbl (10): IMG Genome ID, Bin Completeness, Bin Contamination, Total Number ...
## date (1): Date Added
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Warning: Expected 6 pieces. Additional pieces discarded in 29 rows [12, 32, 66, 79, 80,
## 88, 96, 102, 104, 240, 334, 386, 657, 790, 846, 931, 943, 983, 1041, 1095,
## ...].
## Warning: Expected 6 pieces. Missing pieces filled with `NA` in 429 rows [6, 7, 42, 49,
## 50, 55, 60, 83, 85, 97, 100, 105, 107, 113, 114, 116, 119, 125, 129, 130, ...].
## Warning: Expected 2 pieces. Missing pieces filled with `NA` in 624 rows [1131, 1132,
## 1133, 1134, 1135, 1136, 1137, 1138, 1139, 1140, 1141, 1142, 1143, 1144, 1145,
## 1146, 1147, 1148, 1149, 1150, ...].
NEON_MAGs_bact_ind <- NEON_MAGs %>%
filter(Domain=="Bacteria") %>%
filter(`Assembly Type`=="Individual")
NEON_MAGs_bact_ind %>%
ggplot(aes(x=Phylum))+geom_bar()+coord_flip()
NEON_MAGs_bact_ind %>%
ggplot(aes(x=fct_infreq(Phylum)))+geom_bar()+coord_flip() #order by count
NEON_MAGs_bact_ind %>%
count(Phylum) %>%
ggplot(aes(x=Phylum, y=n))+geom_col(stat="identity")+coord_flip()
## Warning in geom_col(stat = "identity"): Ignoring unknown parameters: `stat`
NEON_MAGs_bact_ind %>%
count(Phylum) %>%
ggplot(aes(x=reorder(Phylum, n), y=n))+geom_col(stat="identity")+coord_flip()
## Warning in geom_col(stat = "identity"): Ignoring unknown parameters: `stat`
NEON_MAGs_bact_ind %>%
ggplot(aes(x=fct_rev(fct_infreq(Phylum)), fill=Site))+geom_bar()+coord_flip()
NEON_MAGs_bact_ind %>%
ggplot(aes(x=fct_rev(fct_infreq(Phylum)), fill=Site))+geom_bar(position="dodge")+coord_flip()
NEON_MAGs_bact_ind %>%
ggplot(aes(x=fct_rev(fct_infreq(Phylum)), fill=Site))+geom_bar(position=position_dodge2(width=0.9, preserve= "single"))+coord_flip()
NEON_MAGs_bact_ind %>%
ggplot(aes(x=Phylum))+geom_bar(position=position_dodge2(width=0.9, preserve="single"))+coord_flip()+facet_wrap(vars(Site), scales="free_y", ncol=2)
NEON_MAGs_bact_ind %>%
ggplot(aes(x=`Total Number of Bases`))+geom_histogram(bins=50)
NEON_MAGs_bact_ind %>%
ggplot(aes(x=fct_infreq(Phylum), y=`Total Number of Bases`))+geom_boxplot()+theme(axis.text.x=element_text(angle=45, vjust=1, hjust=1))
NEON_MAGs_bact_ind %>%
ggplot(aes(x=fct_infreq(Phylum), y=`Total Number of Bases`))+geom_point()+coord_flip()
What are the overall class MAG counts?
NEON_MAGs_bact_ind %>%
ggplot(aes(x=fct_rev(fct_infreq(Class)), fill=Class))+geom_bar(show.legend=FALSE)+coord_flip()+labs(title="Overall MAG Count per Class", x="Class", y="MAG Count")+theme_classic()
What are the MAG counts for each subplot. Color by site ID.
NEON_MAGs_bact_ind %>%
ggplot(aes(x=fct_rev(fct_infreq(Subplot)), fill=`Site ID`))+geom_bar()+coord_flip()+labs(title="Overall MAG Count per Subplot", x="Subplot ID", y="MAG Count")+theme_classic()
How many novel bacteria were discovered (Show that number of NAs for each taxonomic level)?
NEON_MAGs_bact_ind %>%
filter(is.na(Class) | is.na(Order) | is.na(Family) | is.na(Genus) ) %>%
ggplot(aes(x=fct_rev(fct_infreq(Site)), fill=Site))+geom_bar(show.legend=FALSE)+coord_flip()+labs(title="Total Novel Bacteria per Site", x="Site", y="n")+theme_classic()
How many novel bacterial MAGs are high quality vs medium quality?
NEON_MAGs_bact_ind %>%
filter(is.na(Class) | is.na(Order) | is.na(Family) | is.na(Genus) ) %>%
ggplot(aes(x=fct_infreq(`Bin Quality`), fill=`Bin Quality`))+geom_bar()+labs(title="Novel Bacteria MAG Quality", x="Quality", y="MAGs")+theme_classic()
What phyla have novel bacterial genera?
NEON_MAGs_bact_ind %>%
filter(is.na(Class) | is.na(Order) | is.na(Family) | is.na(Genus) ) %>%
ggplot(aes(x=fct_infreq(`Phylum`), fill=`Phylum`))+geom_bar(show.legend=FALSE)+coord_flip()+labs(title="Novel Genera per Phylum", x="Phylum", y="n")+theme_classic()
Make a stacked bar plot of the total number of MAGs at each site using Phylum as the fill.
NEON_MAGs_bact_ind %>%
ggplot(aes(x=fct_rev(fct_infreq(Site)), fill=Phylum))+geom_bar()+coord_flip()+labs(title="Total Number of MAGs per site", x="Site", y="MAG Count",fill="Phylum")+theme_classic()
## Exercise #7
Using facet_wrap make plots of the total number of MAGs at each site for each phylum (e.g. similar to the example above but using the site ID and separating each graph by phylum.)
NEON_MAGs_bact_ind %>%
ggplot(aes(x=Phylum))+geom_bar(aes(fill=Phylum),position=position_dodge2(width=0.9, preserve="single"),show.legend=FALSE)+coord_flip()+facet_wrap(vars(`Site ID`), scales="free_y", ncol=4)+labs(title="Total Number of Mags per Phylum at Each Site", x="Phylum", y="n")+theme_classic()
What is the relationship between MAGs genome size and the number of genes? Color by Phylum.
NEON_MAGs_bact_ind %>%
mutate(`Estimated Genome Size (Kbp)`=as.integer(`Total Number of Bases`/(`Bin Completeness`/100)/1000)) %>%
ggplot(aes(x=`Gene Count`, y=`Estimated Genome Size (Kbp)`, color=Phylum))+geom_point()+labs(title="Gene Count vs Estimated Mag Genome Size", x="Gene", y="Estimated Genome Size (Kbp)")+theme_classic()
What is the relationship between scaffold count and MAG completeness?
NEON_MAGs_bact_ind %>%
ggplot(aes(x=`Scaffold Count`, y=`Bin Completeness`, color=Phylum))+geom_point()+labs(title="Scaffold Count vs MAG Completeness", x="Scaffold Count", y="MAG Completeness")+theme_classic()